{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup File for Keras Models\n",
    "Use `%run Setup.ipynb` in another notebook to perform all these tasks automatically.\n",
    "\n",
    "Parameters that can be re-configured:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "MAX_NB_WORDS = 40000 # max no. of words for tokenizer\n",
    "MAX_SEQUENCE_LENGTH = 30 # max length of text (words) including padding\n",
    "VALIDATION_SPLIT = 0.2\n",
    "EMBEDDING_DIM = 200 # embedding dimensions for word vectors (word2vec/GloVe)\n",
    "GLOVE_DIR = \"dataset/glove/glove.twitter.27B.\"+str(200)+\"d.txt\"\n",
    "print(\"[i] Loaded Parameters:\\n\",\n",
    "      MAX_NB_WORDS,MAX_SEQUENCE_LENGTH+5,\n",
    "      VALIDATION_SPLIT,EMBEDDING_DIM,\"\\n\",\n",
    "      GLOVE_DIR)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Imports:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"[i] Importing Modules...\")\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import re, sys, os, csv, keras, pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from keras import regularizers, initializers, optimizers, callbacks\n",
    "from keras.preprocessing.text import Tokenizer\n",
    "from keras.preprocessing.sequence import pad_sequences\n",
    "from keras.utils.np_utils import to_categorical\n",
    "from keras.layers import Embedding\n",
    "from keras.layers import Dense, Input, Flatten, Concatenate\n",
    "from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional\n",
    "from keras.models import Model\n",
    "from keras import backend as K\n",
    "from keras.engine.topology import Layer, InputSpec\n",
    "print(\"[+] Using Keras version\",keras.__version__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"[+] Finished Importing Modules\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "texts, labels = [], []\n",
    "print(\"[i] Reading from csv file...\", end=\"\")\n",
    "with open('data.csv') as csvfile:\n",
    "    readCSV = csv.reader(csvfile, delimiter=',')\n",
    "    for row in readCSV:\n",
    "        texts.append(row[0])\n",
    "        labels.append(row[1])\n",
    "print(\"Done!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Convert text to word tokens (numbers that refer to the words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "tokenizer = Tokenizer(num_words=MAX_NB_WORDS)\n",
    "tokenizer.fit_on_texts(texts)\n",
    "with open('tokenizer.pickle', 'wb') as handle:\n",
    "    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)\n",
    "print(\"[i] Saved word tokenizer to file: tokenizer.pickle\")\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('tokenizer.pickle', 'rb') as handle:\n",
    "    tokenizer = pickle.load(handle)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Convert tweets to sequences of word tokens with zero padding at the front and back"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sequences = tokenizer.texts_to_sequences(texts)\n",
    "word_index = tokenizer.word_index\n",
    "print('[i] Found %s unique tokens.' % len(word_index))\n",
    "data_int = pad_sequences(sequences, padding='pre', maxlen=(MAX_SEQUENCE_LENGTH-5))\n",
    "data = pad_sequences(data_int, padding='post', maxlen=(MAX_SEQUENCE_LENGTH))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = to_categorical(np.asarray(labels)) # convert to one-hot encoding vectors\n",
    "print('[+] Shape of data tensor:', data.shape)\n",
    "print('[+] Shape of label tensor:', labels.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "indices = np.arange(data.shape[0])\n",
    "np.random.shuffle(indices)\n",
    "data = data[indices]\n",
    "labels = labels[indices]\n",
    "nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train = data[:-nb_validation_samples]\n",
    "y_train = labels[:-nb_validation_samples]\n",
    "x_val = data[-nb_validation_samples:]\n",
    "y_val = labels[-nb_validation_samples:]\n",
    "\n",
    "print('[i] Number of entries in each category:')\n",
    "print(\"[+] Training:\\n\",y_train.sum(axis=0))\n",
    "print(\"[+] Validation:\\n\",y_val.sum(axis=0))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Preparing the Embedding layer\n",
    "\n",
    "Compute an index mapping words to known embeddings, by parsing the data dump of pre-trained embeddings.\n",
    "\n",
    "We use pre-trained [GloVe](https://nlp.stanford.edu/projects/glove/) vectors from Stanford NLP. For new words, a \"randomised vector\" will be created."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "embeddings_index = {}\n",
    "f = open(GLOVE_DIR)\n",
    "print(\"[i] Loading GloVe from:\",GLOVE_DIR,\"...\",end=\"\")\n",
    "for line in f:\n",
    "    values = line.split()\n",
    "    word = values[0]\n",
    "    embeddings_index[word] = np.asarray(values[1:], dtype='float32')\n",
    "f.close()\n",
    "print(\"Done.\\n[+] Proceeding with Embedding Matrix...\", end=\"\")\n",
    "embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))\n",
    "for word, i in word_index.items():\n",
    "    embedding_vector = embeddings_index.get(word)\n",
    "    if embedding_vector is not None:\n",
    "        # words not found in embedding index will be all-zeros.\n",
    "        embedding_matrix[i] = embedding_vector\n",
    "print(\"[i] Completed!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"[i] Finished running setup.\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
